1 IMPORT LIBRARIES

library(ggplot2)
library(tidyverse)
library(dplyr)
library(Seurat)
library(SingleCellExperiment)
library(scater)
library(SCpubr)

# set the working directory to the project root; any relative path used
# afterwards is resolved against it
setwd("~/Projects/HumanThymusProject/")
# run the project-wide colors_universal.R script
# (presumably defines shared color palettes, judging by its name -- TODO confirm)
source("~/Projects/HumanThymusProject/scripts/colors_universal.R")

2 HUMAN THYMUS ATLAS

2.1 Import data

The Human Thymus Atlas was downloaded from cellxgene as an h5ad file (the version containing 255,901 cells).

# import the h5ad object downloaded from cellxgene as a SingleCellExperiment
# (zellkonverter handles the h5ad -> SCE conversion)
sce_human_cellgene <- zellkonverter::readH5AD("~/Projects/HumanThymusProject/data_github/park_dataset/c6e08ab6-ab3b-41dc-8058-8e6442e081ec.h5ad")
# print the object summary: dimensions, assays, row/column metadata fields
print(sce_human_cellgene)
## class: SingleCellExperiment 
## dim: 32839 255901 
## metadata(5): cell_type_ontology_term_id_colors citation
##   schema_reference schema_version title
## assays(1): X
## rownames(32839): ENSG00000000003 ENSG00000000005 ... ENSG00000283118
##   ENSG00000283125
## rowData names(5): feature_is_filtered feature_name feature_reference
##   feature_biotype feature_length
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
##   FCAImmP7179369-AAACCTGAGCCTATGT ...
##   Human_colon_16S7985397-TTTGTCAAGCTGAACG
##   Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(30): assay_ontology_term_id cell_type_ontology_term_id
##   ... development_stage observation_joinid
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out the per-cell metadata (colData) shipped with the cellxgene object
colData(sce_human_cellgene)
## DataFrame with 255901 rows and 30 columns
##                                         assay_ontology_term_id
##                                                       <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                    EFO:0009899
## FCAImmP7179369-AAACCTGAGCCTATGT                    EFO:0009899
## FCAImmP7179369-AAACCTGAGTCGCCGT                    EFO:0009899
## FCAImmP7179369-AAACCTGCAGCATGAG                    EFO:0009899
## FCAImmP7179369-AAACCTGGTCTCCCTA                    EFO:0009899
## ...                                                        ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC            EFO:0011025
## Human_colon_16S7985397-TTTGGTTCATGGTAGG            EFO:0011025
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA            EFO:0011025
## Human_colon_16S7985397-TTTGTCAAGCTGAACG            EFO:0011025
## Human_colon_16S7985397-TTTGTCAGTATTAGCC            EFO:0011025
##                                         cell_type_ontology_term_id
##                                                           <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                         CL:0000625
## FCAImmP7179369-AAACCTGAGCCTATGT                         CL:0000624
## FCAImmP7179369-AAACCTGAGTCGCCGT                         CL:0000809
## FCAImmP7179369-AAACCTGCAGCATGAG                         CL:0000625
## FCAImmP7179369-AAACCTGGTCTCCCTA                         CL:0000915
## ...                                                            ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC                 CL:0000809
## Human_colon_16S7985397-TTTGGTTCATGGTAGG                 CL:0000809
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA                 CL:0000809
## Human_colon_16S7985397-TTTGTCAAGCTGAACG                 CL:0000809
## Human_colon_16S7985397-TTTGTCAGTATTAGCC                 CL:0000235
##                                         development_stage_ontology_term_id
##                                                                   <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                             HsapDv:0000053
## FCAImmP7179369-AAACCTGAGCCTATGT                             HsapDv:0000053
## FCAImmP7179369-AAACCTGAGTCGCCGT                             HsapDv:0000053
## FCAImmP7179369-AAACCTGCAGCATGAG                             HsapDv:0000053
## FCAImmP7179369-AAACCTGGTCTCCCTA                             HsapDv:0000053
## ...                                                                    ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC                     HsapDv:0000047
## Human_colon_16S7985397-TTTGGTTCATGGTAGG                     HsapDv:0000047
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA                     HsapDv:0000047
## Human_colon_16S7985397-TTTGTCAAGCTGAACG                     HsapDv:0000047
## Human_colon_16S7985397-TTTGTCAGTATTAGCC                     HsapDv:0000047
##                                         disease_ontology_term_id
##                                                         <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                     PATO:0000461
## FCAImmP7179369-AAACCTGAGCCTATGT                     PATO:0000461
## FCAImmP7179369-AAACCTGAGTCGCCGT                     PATO:0000461
## FCAImmP7179369-AAACCTGCAGCATGAG                     PATO:0000461
## FCAImmP7179369-AAACCTGGTCTCCCTA                     PATO:0000461
## ...                                                          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC             PATO:0000461
## Human_colon_16S7985397-TTTGGTTCATGGTAGG             PATO:0000461
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA             PATO:0000461
## Human_colon_16S7985397-TTTGTCAAGCTGAACG             PATO:0000461
## Human_colon_16S7985397-TTTGTCAGTATTAGCC             PATO:0000461
##                                         self_reported_ethnicity_ontology_term_id
##                                                                         <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                                          unknown
## FCAImmP7179369-AAACCTGAGCCTATGT                                          unknown
## FCAImmP7179369-AAACCTGAGTCGCCGT                                          unknown
## FCAImmP7179369-AAACCTGCAGCATGAG                                          unknown
## FCAImmP7179369-AAACCTGGTCTCCCTA                                          unknown
## ...                                                                          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC                                  unknown
## Human_colon_16S7985397-TTTGGTTCATGGTAGG                                  unknown
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA                                  unknown
## Human_colon_16S7985397-TTTGTCAAGCTGAACG                                  unknown
## Human_colon_16S7985397-TTTGTCAGTATTAGCC                                  unknown
##                                         is_primary_data
##                                               <logical>
## FCAImmP7179369-AAACCTGAGCCCAATT                    TRUE
## FCAImmP7179369-AAACCTGAGCCTATGT                    TRUE
## FCAImmP7179369-AAACCTGAGTCGCCGT                    TRUE
## FCAImmP7179369-AAACCTGCAGCATGAG                    TRUE
## FCAImmP7179369-AAACCTGGTCTCCCTA                    TRUE
## ...                                                 ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC            TRUE
## Human_colon_16S7985397-TTTGGTTCATGGTAGG            TRUE
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA            TRUE
## Human_colon_16S7985397-TTTGTCAAGCTGAACG            TRUE
## Human_colon_16S7985397-TTTGTCAGTATTAGCC            TRUE
##                                         organism_ontology_term_id
##                                                          <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                    NCBITaxon:9606
## FCAImmP7179369-AAACCTGAGCCTATGT                    NCBITaxon:9606
## FCAImmP7179369-AAACCTGAGTCGCCGT                    NCBITaxon:9606
## FCAImmP7179369-AAACCTGCAGCATGAG                    NCBITaxon:9606
## FCAImmP7179369-AAACCTGGTCTCCCTA                    NCBITaxon:9606
## ...                                                           ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC            NCBITaxon:9606
## Human_colon_16S7985397-TTTGGTTCATGGTAGG            NCBITaxon:9606
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA            NCBITaxon:9606
## Human_colon_16S7985397-TTTGTCAAGCTGAACG            NCBITaxon:9606
## Human_colon_16S7985397-TTTGTCAGTATTAGCC            NCBITaxon:9606
##                                         sex_ontology_term_id
##                                                     <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                 PATO:0000384
## FCAImmP7179369-AAACCTGAGCCTATGT                 PATO:0000384
## FCAImmP7179369-AAACCTGAGTCGCCGT                 PATO:0000384
## FCAImmP7179369-AAACCTGCAGCATGAG                 PATO:0000384
## FCAImmP7179369-AAACCTGGTCTCCCTA                 PATO:0000384
## ...                                                      ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC         PATO:0000384
## Human_colon_16S7985397-TTTGGTTCATGGTAGG         PATO:0000384
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA         PATO:0000384
## Human_colon_16S7985397-TTTGTCAAGCTGAACG         PATO:0000384
## Human_colon_16S7985397-TTTGTCAGTATTAGCC         PATO:0000384
##                                         tissue_ontology_term_id
##                                                        <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                  UBERON:0002370
## FCAImmP7179369-AAACCTGAGCCTATGT                  UBERON:0002370
## FCAImmP7179369-AAACCTGAGTCGCCGT                  UBERON:0002370
## FCAImmP7179369-AAACCTGCAGCATGAG                  UBERON:0002370
## FCAImmP7179369-AAACCTGGTCTCCCTA                  UBERON:0002370
## ...                                                         ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC          UBERON:0002370
## Human_colon_16S7985397-TTTGGTTCATGGTAGG          UBERON:0002370
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA          UBERON:0002370
## Human_colon_16S7985397-TTTGTCAAGCTGAACG          UBERON:0002370
## Human_colon_16S7985397-TTTGTCAGTATTAGCC          UBERON:0002370
##                                                    Sample  n_counts   n_genes
##                                                  <factor> <numeric> <numeric>
## FCAImmP7179369-AAACCTGAGCCCAATT                F21_TH_45P      8738      1898
## FCAImmP7179369-AAACCTGAGCCTATGT                F21_TH_45P      3627      1210
## FCAImmP7179369-AAACCTGAGTCGCCGT                F21_TH_45P     14187      3153
## FCAImmP7179369-AAACCTGCAGCATGAG                F21_TH_45P     12309      2387
## FCAImmP7179369-AAACCTGGTCTCCCTA                F21_TH_45P      9128      2439
## ...                                                   ...       ...       ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC F74_TH_TOT_5GEX_2     16151      4158
## Human_colon_16S7985397-TTTGGTTCATGGTAGG F74_TH_TOT_5GEX_2      5315      1929
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA F74_TH_TOT_5GEX_2      5386      2084
## Human_colon_16S7985397-TTTGTCAAGCTGAACG F74_TH_TOT_5GEX_2      5096      1735
## Human_colon_16S7985397-TTTGTCAGTATTAGCC F74_TH_TOT_5GEX_2      3335      1187
##                                         donor_id     sort   method
##                                         <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT              F21      45P     3GEX
## FCAImmP7179369-AAACCTGAGCCTATGT              F21      45P     3GEX
## FCAImmP7179369-AAACCTGAGTCGCCGT              F21      45P     3GEX
## FCAImmP7179369-AAACCTGCAGCATGAG              F21      45P     3GEX
## FCAImmP7179369-AAACCTGGTCTCCCTA              F21      45P     3GEX
## ...                                          ...      ...      ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC      F74      TOT     5GEX
## Human_colon_16S7985397-TTTGGTTCATGGTAGG      F74      TOT     5GEX
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA      F74      TOT     5GEX
## Human_colon_16S7985397-TTTGTCAAGCTGAACG      F74      TOT     5GEX
## Human_colon_16S7985397-TTTGTCAGTATTAGCC      F74      TOT     5GEX
##                                                           file      mito
##                                                       <factor> <numeric>
## FCAImmP7179369-AAACCTGAGCCCAATT                 FCAImmP7179369 0.0215152
## FCAImmP7179369-AAACCTGAGCCTATGT                 FCAImmP7179369 0.0308795
## FCAImmP7179369-AAACCTGAGTCGCCGT                 FCAImmP7179369 0.0225559
## FCAImmP7179369-AAACCTGCAGCATGAG                 FCAImmP7179369 0.0279470
## FCAImmP7179369-AAACCTGGTCTCCCTA                 FCAImmP7179369 0.0256354
## ...                                                        ...       ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC Human_colon_16S7985397 0.0191319
## Human_colon_16S7985397-TTTGGTTCATGGTAGG Human_colon_16S7985397 0.0180621
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA Human_colon_16S7985397 0.0298923
## Human_colon_16S7985397-TTTGTCAAGCTGAACG Human_colon_16S7985397 0.0141287
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Human_colon_16S7985397 0.0083958
##                                         doublet_scores predicted_doublets
##                                              <numeric>          <logical>
## FCAImmP7179369-AAACCTGAGCCCAATT              0.2093023              FALSE
## FCAImmP7179369-AAACCTGAGCCTATGT              0.1118421              FALSE
## FCAImmP7179369-AAACCTGAGTCGCCGT              0.0433071              FALSE
## FCAImmP7179369-AAACCTGCAGCATGAG              0.1118421              FALSE
## FCAImmP7179369-AAACCTGGTCTCCCTA              0.1610169              FALSE
## ...                                                ...                ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC      0.4117647              FALSE
## Human_colon_16S7985397-TTTGGTTCATGGTAGG      0.0497738              FALSE
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA      0.0556439              FALSE
## Human_colon_16S7985397-TTTGTCAAGCTGAACG      0.0556439              FALSE
## Human_colon_16S7985397-TTTGTCAGTATTAGCC      0.1176471              FALSE
##                                         suspension_type tissue_type
##                                                <factor>    <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                    cell      tissue
## FCAImmP7179369-AAACCTGAGCCTATGT                    cell      tissue
## FCAImmP7179369-AAACCTGAGTCGCCGT                    cell      tissue
## FCAImmP7179369-AAACCTGCAGCATGAG                    cell      tissue
## FCAImmP7179369-AAACCTGGTCTCCCTA                    cell      tissue
## ...                                                 ...         ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC            cell      tissue
## Human_colon_16S7985397-TTTGGTTCATGGTAGG            cell      tissue
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA            cell      tissue
## Human_colon_16S7985397-TTTGTCAAGCTGAACG            cell      tissue
## Human_colon_16S7985397-TTTGTCAGTATTAGCC            cell      tissue
##                                                                                           cell_type
##                                                                                            <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT         CD8-positive, alpha-beta T cell                            
## FCAImmP7179369-AAACCTGAGCCTATGT         CD4-positive, alpha-beta T cell                            
## FCAImmP7179369-AAACCTGAGTCGCCGT         double-positive, alpha-beta thymocyte                      
## FCAImmP7179369-AAACCTGCAGCATGAG         CD8-positive, alpha-beta T cell                            
## FCAImmP7179369-AAACCTGGTCTCCCTA         CD8-alpha-alpha-positive, alpha-beta intraepithelial T cell
## ...                                                                                             ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC                       double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGGTTCATGGTAGG                       double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA                       double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGTCAAGCTGAACG                       double-positive, alpha-beta thymocyte
## Human_colon_16S7985397-TTTGTCAGTATTAGCC                       macrophage                           
##                                             assay  disease     organism
##                                          <factor> <factor>     <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT         10x 3' v2   normal Homo sapiens
## FCAImmP7179369-AAACCTGAGCCTATGT         10x 3' v2   normal Homo sapiens
## FCAImmP7179369-AAACCTGAGTCGCCGT         10x 3' v2   normal Homo sapiens
## FCAImmP7179369-AAACCTGCAGCATGAG         10x 3' v2   normal Homo sapiens
## FCAImmP7179369-AAACCTGGTCTCCCTA         10x 3' v2   normal Homo sapiens
## ...                                           ...      ...          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 10x 5' v1   normal Homo sapiens
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 10x 5' v1   normal Homo sapiens
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 10x 5' v1   normal Homo sapiens
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 10x 5' v1   normal Homo sapiens
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 10x 5' v1   normal Homo sapiens
##                                              sex   tissue
##                                         <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT             male   thymus
## FCAImmP7179369-AAACCTGAGCCTATGT             male   thymus
## FCAImmP7179369-AAACCTGAGTCGCCGT             male   thymus
## FCAImmP7179369-AAACCTGCAGCATGAG             male   thymus
## FCAImmP7179369-AAACCTGGTCTCCCTA             male   thymus
## ...                                          ...      ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC     male   thymus
## Human_colon_16S7985397-TTTGGTTCATGGTAGG     male   thymus
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA     male   thymus
## Human_colon_16S7985397-TTTGTCAAGCTGAACG     male   thymus
## Human_colon_16S7985397-TTTGTCAGTATTAGCC     male   thymus
##                                         self_reported_ethnicity
##                                                        <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                         unknown
## FCAImmP7179369-AAACCTGAGCCTATGT                         unknown
## FCAImmP7179369-AAACCTGAGTCGCCGT                         unknown
## FCAImmP7179369-AAACCTGCAGCATGAG                         unknown
## FCAImmP7179369-AAACCTGGTCTCCCTA                         unknown
## ...                                                         ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC                 unknown
## Human_colon_16S7985397-TTTGGTTCATGGTAGG                 unknown
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA                 unknown
## Human_colon_16S7985397-TTTGTCAAGCTGAACG                 unknown
## Human_colon_16S7985397-TTTGTCAGTATTAGCC                 unknown
##                                                                development_stage
##                                                                         <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT         16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGAGCCTATGT         16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGAGTCGCCGT         16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGCAGCATGAG         16th week post-fertilization human stage
## FCAImmP7179369-AAACCTGGTCTCCCTA         16th week post-fertilization human stage
## ...                                                                          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGGTTCATGGTAGG 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGTCAAGCTGAACG 10th week post-fertilization human stage
## Human_colon_16S7985397-TTTGTCAGTATTAGCC 10th week post-fertilization human stage
##                                         observation_joinid
##                                                <character>
## FCAImmP7179369-AAACCTGAGCCCAATT                 X+W(sH{JAl
## FCAImmP7179369-AAACCTGAGCCTATGT                 xsX~{(Sp*z
## FCAImmP7179369-AAACCTGAGTCGCCGT                 <6<|6YaXP>
## FCAImmP7179369-AAACCTGCAGCATGAG                 YP@l>cc;QS
## FCAImmP7179369-AAACCTGGTCTCCCTA                 nM;U|cg{S^
## ...                                                    ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC         `K_xYCtC9f
## Human_colon_16S7985397-TTTGGTTCATGGTAGG         r8Nu6Wu^p#
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA         Th}*MaCPmt
## Human_colon_16S7985397-TTTGTCAAGCTGAACG         O-ymHp#;{~
## Human_colon_16S7985397-TTTGTCAGTATTAGCC         2JdjrhlIRC
# check umap and cell type annotation: plot the stored "X_umap" embedding,
# colored by the cellxgene "cell_type" column
plotReducedDim(sce_human_cellgene, dimred="X_umap", colour_by="cell_type")

# check out gene names: the per-gene annotation (rowData) is indexed by
# Ensembl ID, with the gene symbols stored in the feature_name column
rowData(sce_human_cellgene)
## DataFrame with 32839 rows and 5 columns
##                 feature_is_filtered      feature_name feature_reference
##                           <logical>          <factor>          <factor>
## ENSG00000000003               FALSE          TSPAN6      NCBITaxon:9606
## ENSG00000000005               FALSE          TNMD        NCBITaxon:9606
## ENSG00000000419               FALSE          DPM1        NCBITaxon:9606
## ENSG00000000457               FALSE          SCYL3       NCBITaxon:9606
## ENSG00000000460               FALSE          C1orf112    NCBITaxon:9606
## ...                             ...               ...               ...
## ENSG00000283096               FALSE ENSG00000283096.1    NCBITaxon:9606
## ENSG00000283103               FALSE ENSG00000283103.5    NCBITaxon:9606
## ENSG00000283117               FALSE MGC4859              NCBITaxon:9606
## ENSG00000283118               FALSE ENSG00000283118.1    NCBITaxon:9606
## ENSG00000283125               FALSE ENSG00000283125.1    NCBITaxon:9606
##                 feature_biotype feature_length
##                        <factor>       <factor>
## ENSG00000000003            gene           4530
## ENSG00000000005            gene           1476
## ENSG00000000419            gene           9276
## ENSG00000000457            gene           6883
## ENSG00000000460            gene           5970
## ...                         ...            ...
## ENSG00000283096            gene           1259
## ENSG00000283103            gene           4585
## ENSG00000283117            gene           3118
## ENSG00000283118            gene           644 
## ENSG00000283125            gene           547
# tabulate whether any gene symbol in feature_name occurs more than once
table(duplicated(rowData(sce_human_cellgene)$feature_name), useNA="ifany") # none of the gene symbols are duplicated (ideal :)
## 
## FALSE 
## 32839

The cell annotation doesn’t correspond to Figure 1 of the Park et al. paper. We are looking for different annotation levels, such as the ones on the cellatlas.io website. We therefore downloaded the h5ad file from cellatlas.io; let’s import it.

# import the h5ad file from cellatlas.io as a second SingleCellExperiment
sce_human_cellatlas <- zellkonverter::readH5AD("~/Projects/HumanThymusProject/data_github/park_dataset/HTA08_v01_A05_Science_human_fig1.h5ad")
# print the object summary to compare it with the cellxgene object
print(sce_human_cellatlas)
## class: SingleCellExperiment 
## dim: 33694 255901 
## metadata(0):
## assays(1): X
## rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
## rowData names(0):
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
##   FCAImmP7179369-AAACCTGAGCCTATGT ...
##   Human_colon_16S7985397-TTTGTCAAGCTGAACG
##   Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(16): Anno_level_1 Anno_level_2 ... Gender Source
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out the per-cell metadata: this object carries the Anno_level_*
# annotation columns
colData(sce_human_cellatlas) # much better!
## DataFrame with 255901 rows and 16 columns
##                                         Anno_level_1 Anno_level_2 Anno_level_3
##                                             <factor>     <factor>     <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                    T           SP     T_naive 
## FCAImmP7179369-AAACCTGAGCCTATGT                    T           SP     T_naive 
## FCAImmP7179369-AAACCTGAGTCGCCGT                    T           DP     DP      
## FCAImmP7179369-AAACCTGCAGCATGAG                    T           SP     T_naive 
## FCAImmP7179369-AAACCTGGTCTCCCTA                    T           SP     CD8αα(I)
## ...                                              ...          ...          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC      T           DP                DP 
## Human_colon_16S7985397-TTTGGTTCATGGTAGG      T           DP                DP 
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA      T           DP                DP 
## Human_colon_16S7985397-TTTGTCAAGCTGAACG      T           DP                DP 
## Human_colon_16S7985397-TTTGTCAGTATTAGCC      Myeloid     Mac/Mono          Mac
##                                         Anno_level_4 Anno_level_5
##                                             <factor>     <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT             CD8+T        CD8+T   
## FCAImmP7179369-AAACCTGAGCCTATGT             CD4+T        CD4+T   
## FCAImmP7179369-AAACCTGAGTCGCCGT             DP           DP(P)   
## FCAImmP7179369-AAACCTGCAGCATGAG             CD8+T        CD8+T   
## FCAImmP7179369-AAACCTGGTCTCCCTA             CD8αα(I)     CD8αα(I)
## ...                                              ...          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC          DP         DP(P)
## Human_colon_16S7985397-TTTGGTTCATGGTAGG          DP         DP(Q)
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA          DP         DP(P)
## Human_colon_16S7985397-TTTGTCAAGCTGAACG          DP         DP(Q)
## Human_colon_16S7985397-TTTGTCAGTATTAGCC          Mac        Mac  
##                                         Anno_level_fig1            Sample
##                                                <factor>          <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                   CD8+T        F21_TH_45P
## FCAImmP7179369-AAACCTGAGCCTATGT                   CD4+T        F21_TH_45P
## FCAImmP7179369-AAACCTGAGTCGCCGT                   DP           F21_TH_45P
## FCAImmP7179369-AAACCTGCAGCATGAG                   CD8+T        F21_TH_45P
## FCAImmP7179369-AAACCTGGTCTCCCTA                   CD8αα        F21_TH_45P
## ...                                                 ...               ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC             DP  F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGGTTCATGGTAGG             DP  F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA             DP  F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGTCAAGCTGAACG             DP  F74_TH_TOT_5GEX_2
## Human_colon_16S7985397-TTTGTCAGTATTAGCC             Mac F74_TH_TOT_5GEX_2
##                                            donor    organ     sort   method
##                                         <factor> <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT              F21       TH      45P     3GEX
## FCAImmP7179369-AAACCTGAGCCTATGT              F21       TH      45P     3GEX
## FCAImmP7179369-AAACCTGAGTCGCCGT              F21       TH      45P     3GEX
## FCAImmP7179369-AAACCTGCAGCATGAG              F21       TH      45P     3GEX
## FCAImmP7179369-AAACCTGGTCTCCCTA              F21       TH      45P     3GEX
## ...                                          ...      ...      ...      ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC      F74       TH      TOT     5GEX
## Human_colon_16S7985397-TTTGGTTCATGGTAGG      F74       TH      TOT     5GEX
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA      F74       TH      TOT     5GEX
## Human_colon_16S7985397-TTTGTCAAGCTGAACG      F74       TH      TOT     5GEX
## Human_colon_16S7985397-TTTGTCAGTATTAGCC      F74       TH      TOT     5GEX
##                                                           file   Anno_stage
##                                                       <factor>     <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT                 FCAImmP7179369 CD8+T_middle
## FCAImmP7179369-AAACCTGAGCCTATGT                 FCAImmP7179369 CD4+T_middle
## FCAImmP7179369-AAACCTGAGTCGCCGT                 FCAImmP7179369 DP_middle   
## FCAImmP7179369-AAACCTGCAGCATGAG                 FCAImmP7179369 CD8+T_middle
## FCAImmP7179369-AAACCTGGTCTCCCTA                 FCAImmP7179369 CD8αα_middle
## ...                                                        ...          ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC Human_colon_16S7985397    DP_early 
## Human_colon_16S7985397-TTTGGTTCATGGTAGG Human_colon_16S7985397    DP_early 
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA Human_colon_16S7985397    DP_early 
## Human_colon_16S7985397-TTTGTCAAGCTGAACG Human_colon_16S7985397    DP_early 
## Human_colon_16S7985397-TTTGTCAGTATTAGCC Human_colon_16S7985397    Mac_early
##                                              Age   Gender   Source
##                                         <factor> <factor> <factor>
## FCAImmP7179369-AAACCTGAGCCCAATT              16w     Male     HDBR
## FCAImmP7179369-AAACCTGAGCCTATGT              16w     Male     HDBR
## FCAImmP7179369-AAACCTGAGTCGCCGT              16w     Male     HDBR
## FCAImmP7179369-AAACCTGCAGCATGAG              16w     Male     HDBR
## FCAImmP7179369-AAACCTGGTCTCCCTA              16w     Male     HDBR
## ...                                          ...      ...      ...
## Human_colon_16S7985397-TTTGGTTCAAACCTAC      10w     Male     HDBR
## Human_colon_16S7985397-TTTGGTTCATGGTAGG      10w     Male     HDBR
## Human_colon_16S7985397-TTTGGTTTCAAGGTAA      10w     Male     HDBR
## Human_colon_16S7985397-TTTGTCAAGCTGAACG      10w     Male     HDBR
## Human_colon_16S7985397-TTTGTCAGTATTAGCC      10w     Male     HDBR
# check umap and cell type annotation: plot the stored "X_umap" embedding,
# colored by the "Anno_level_fig1" column
plotReducedDim(sce_human_cellatlas, dimred="X_umap", colour_by="Anno_level_fig1")

# check out gene names: this object has no rowData, so the gene symbols are
# only available as rownames
# rowData(sce_human_cellatlas) # rowData is empty
table(duplicated(rownames(sce_human_cellatlas)), useNA="ifany") # some gene symbols are duplicated (not ideal...)
## 
## FALSE  TRUE 
## 33660    34

In summary:

  • h5ad file from cellxgene contains more information on feature names (with unique gene symbols);
  • h5ad file from cellatlas.io contains more/better cell metadata information (with correct clustering annotation).

We will combine all we need into a new SingleCellExperiment object, and then convert it into a Seurat object.

2.2 Create Seurat object

First, let’s create a new SingleCellExperiment object.

# check that the cell IDs are the same, and in the same order, in both sources
# the tabulation alone would let the script continue on a mismatch, so the
# check is enforced first: the per-cell metadata of one object is combined
# positionally with the expression matrix of the other further down
stopifnot(identical(colnames(sce_human_cellgene), colnames(sce_human_cellatlas)))
table(colnames(sce_human_cellgene) == colnames(sce_human_cellatlas), useNA="ifany")
## 
##   TRUE 
## 255901
# table(rownames(colData(sce_human_cellgene)) == rownames(colData(sce_human_cellatlas)), useNA="ifany")

# check that umap coordinates are the same from both sources
# enforced (not only tabulated) so that a mismatch stops the script;
# all.equal() compares the numeric coordinates with a small tolerance rather
# than relying solely on exact floating-point equality, and also fails
# cleanly if the two embeddings differ in shape
stopifnot(isTRUE(all.equal(
  reducedDim(sce_human_cellgene, "X_umap"),
  reducedDim(sce_human_cellatlas, "X_umap"),
  check.attributes = FALSE
)))
table(reducedDim(sce_human_cellgene, "X_umap") == reducedDim(sce_human_cellatlas, "X_umap"), useNA="ifany")
## 
##   TRUE 
## 511802
# build the combined SCE object:
#   - expression matrix, gene annotation and UMAP come from the cellxgene file
#   - per-cell annotation comes from the cellatlas.io file
sce_human_clean <- SingleCellExperiment(
  assays = list(counts = assay(sce_human_cellgene, "X")),
  rowData = rowData(sce_human_cellgene),
  colData = colData(sce_human_cellatlas),
  reducedDims = list(umap = reducedDim(sce_human_cellgene, "X_umap"))
)
print(sce_human_clean)
## class: SingleCellExperiment 
## dim: 32839 255901 
## metadata(0):
## assays(1): counts
## rownames(32839): ENSG00000000003 ENSG00000000005 ... ENSG00000283118
##   ENSG00000283125
## rowData names(5): feature_is_filtered feature_name feature_reference
##   feature_biotype feature_length
## colnames(255901): FCAImmP7179369-AAACCTGAGCCCAATT
##   FCAImmP7179369-AAACCTGAGCCTATGT ...
##   Human_colon_16S7985397-TTTGTCAAGCTGAACG
##   Human_colon_16S7985397-TTTGTCAGTATTAGCC
## colData names(16): Anno_level_1 Anno_level_2 ... Gender Source
## reducedDimNames(1): umap
## mainExpName: NULL
## altExpNames(0):
# switch rownames from Ensembl IDs to gene symbols
# feature_name is stored as a factor, so convert it to a plain character
# vector before using it as rownames
gene_symbols <- as.character(rowData(sce_human_clean)$feature_name)
# last sanity check, now enforced: rownames must be unique gene symbols
stopifnot(anyDuplicated(gene_symbols) == 0L)
rownames(sce_human_clean) <- gene_symbols

Let’s now convert it into a Seurat object.

# convert to seurat
# data=NULL because sce_human_clean only holds a "counts" assay: there is no
# separate normalized-expression assay to hand over to the conversion
seur_human <- Seurat::as.Seurat(sce_human_clean, data=NULL)

# remove genes which have 0 total count (row sum over all cells equal to 0)
# NOTE(review): rowSums() is called on the Seurat object itself -- confirm
# which expression slot/layer it sums with the installed SeuratObject version
seur_human <- seur_human[rowSums(seur_human)!=0,]

# visual check of the converted object: embedding plot of the cells grouped
# (colored) by the Anno_level_fig1 annotation, legend placed on the right
SCpubr::do_DimPlot(
  seur_human,
  group.by="Anno_level_fig1",
  legend.position="right"
)

2.3 Save Seurat object

# save the Seurat object next to the input data, using the same absolute
# project path as the input files above so that the output location does not
# depend on the current working directory
saveRDS(seur_human, "~/Projects/HumanThymusProject/data_github/park_dataset/park_seurat_human.rds")

3 MURINE THYMUS ATLAS

3.1 Import data

The h5ad file was downloaded from the latest version (v1.0.2) of the paper’s Zenodo repository. Let’s import it as a SingleCellExperiment object once again and then convert it to a Seurat object.

# load the mouse h5ad file as a SingleCellExperiment and display its summary
sce_mouse <- zellkonverter::readH5AD(
  file = "~/Projects/HumanThymusProject/data_github/park_dataset/HTA08_v02_A04_Science_mouse_total.h5ad"
)
print(sce_mouse)
## class: SingleCellExperiment 
## dim: 17996 36084 
## metadata(0):
## assays(1): X
## rownames(17996): 0610005C13Rik 0610009B22Rik ... Zzz3 a
## rowData names(0):
## colnames(36084): FCAImmP8084852-AAACCCACATAACGGG
##   FCAImmP8084852-AAACGAAAGCGGGTTA ...
##   GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1
##   ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1
## colData names(4): cell.types stage age sample_ID
## reducedDimNames(1): X_umap
## mainExpName: NULL
## altExpNames(0):
# check out metadata: display the per-cell annotation table of the mouse object
# (one row per cell, one column per annotation)
colData(sce_mouse)
## DataFrame with 36084 rows and 4 columns
##                                          cell.types     stage      age
##                                            <factor>  <factor> <factor>
## FCAImmP8084852-AAACCCACATAACGGG          DP(Q)      postnatal       4W
## FCAImmP8084852-AAACGAAAGCGGGTTA          DP(Q)      postnatal       4W
## FCAImmP8084852-AAACGAACAGACGCTC          αβT(entry) postnatal       4W
## FCAImmP8084852-AAACGAACAGAGATTA          CD4+T      postnatal       4W
## FCAImmP8084852-AAACGAAGTACCCACG          DP(P)      postnatal       4W
## ...                                             ...       ...      ...
## AGATAGAGTACA-GSM2883197_E17_5_wholeThy_1      DN(Q)  prenatal      E17
## TTAAAATACTGA-GSM2883197_E17_5_wholeThy_1      DP(P)  prenatal      E17
## GTTTCTGCACTG-GSM2883197_E17_5_wholeThy_1      DN(Q)  prenatal      E17
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1      DP(P)  prenatal      E17
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1      DP(P)  prenatal      E17
##                                                            sample_ID
##                                                             <factor>
## FCAImmP8084852-AAACCCACATAACGGG                          MM_TH_4W_LI
## FCAImmP8084852-AAACGAAAGCGGGTTA                          MM_TH_4W_LI
## FCAImmP8084852-AAACGAACAGACGCTC                          MM_TH_4W_LI
## FCAImmP8084852-AAACGAACAGAGATTA                          MM_TH_4W_LI
## FCAImmP8084852-AAACGAAGTACCCACG                          MM_TH_4W_LI
## ...                                                              ...
## AGATAGAGTACA-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## TTAAAATACTGA-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## GTTTCTGCACTG-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
## ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1 GSM2883197_E17_5_wholeThy_1
# plot the stored "X_umap" embedding, coloured by the cell type annotation
plotReducedDim(object = sce_mouse, dimred = "X_umap", colour_by = "cell.types")

# check out gene names: tabulate whether any row name occurs more than once
# (none of the gene symbols are duplicated, ideal :)
table(duplicated(rownames(sce_mouse)), useNA = "ifany")
## 
## FALSE 
## 17996

3.2 Create Seurat object

# rebuild the mouse SCE with the expression matrix stored under the name
# "counts" and the embedding stored under the name "umap"
sce_mouse_clean <- SingleCellExperiment(
  assays = list(counts = assay(sce_mouse)),
  rowData = rowData(sce_mouse),
  colData = colData(sce_mouse),
  reducedDims = list(umap = reducedDim(sce_mouse, "X_umap"))
)
print(sce_mouse_clean)
## class: SingleCellExperiment 
## dim: 17996 36084 
## metadata(0):
## assays(1): counts
## rownames(17996): 0610005C13Rik 0610009B22Rik ... Zzz3 a
## rowData names(0):
## colnames(36084): FCAImmP8084852-AAACCCACATAACGGG
##   FCAImmP8084852-AAACGAAAGCGGGTTA ...
##   GTCTTATGTGCG-GSM2883197_E17_5_wholeThy_1
##   ACTGAGATCAAT-GSM2883197_E17_5_wholeThy_1
## colData names(4): cell.types stage age sample_ID
## reducedDimNames(1): umap
## mainExpName: NULL
## altExpNames(0):
# build the Seurat object from the cleaned SCE
# (data = NULL: no separate normalised-data assay is imported)
seur_mouse <- Seurat::as.Seurat(x = sce_mouse_clean, data = NULL)

# keep only the genes whose total count is non-zero
seur_mouse <- seur_mouse[rowSums(seur_mouse) != 0, ]

# dimensional reduction plot coloured by the "cell.types" annotation
SCpubr::do_DimPlot(
  sample = seur_mouse,
  legend.position = "right",
  group.by = "cell.types"
)

3.3 Save Seurat object

# Save next to the input data, using the same absolute project path as the
# import call. A "./"-relative path depends on the current working directory,
# which is not guaranteed to still be the project root at this point
# (knitr restores the working directory after each chunk).
saveRDS(seur_mouse, "~/Projects/HumanThymusProject/data_github/park_dataset/park_seurat_mouse.rds")

4 SESSION INFO

# print the R version, platform and attached/loaded package versions of this session
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats    graphics utils    stats4   methods  base    
## 
## other attached packages:
##  [1] SCpubr_2.0.2                scater_1.22.0              
##  [3] scuttle_1.4.0               SingleCellExperiment_1.16.0
##  [5] SummarizedExperiment_1.24.0 Biobase_2.54.0             
##  [7] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
##  [9] IRanges_2.28.0              S4Vectors_0.32.4           
## [11] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
## [13] matrixStats_1.0.0           SeuratObject_4.1.3         
## [15] Seurat_4.3.0.1              forcats_1.0.0              
## [17] stringr_1.5.0               dplyr_1.1.2                
## [19] purrr_1.0.1                 readr_2.1.4                
## [21] tidyr_1.3.0                 tibble_3.2.1               
## [23] tidyverse_1.3.2             ggplot2_3.4.2              
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                spatstat.explore_3.2-1   
##   [3] reticulate_1.30           tidyselect_1.2.0         
##   [5] htmlwidgets_1.6.2         grid_4.1.3               
##   [7] BiocParallel_1.28.3       Rtsne_0.16               
##   [9] zellkonverter_1.4.0       munsell_0.5.0            
##  [11] ScaledMatrix_1.2.0        codetools_0.2-19         
##  [13] ica_1.0-3                 future_1.33.0            
##  [15] miniUI_0.1.1.1            withr_2.5.0              
##  [17] spatstat.random_3.1-5     colorspace_2.1-0         
##  [19] progressr_0.13.0          filelock_1.0.2           
##  [21] highr_0.10                knitr_1.43               
##  [23] rstudioapi_0.14           ROCR_1.0-11              
##  [25] tensor_1.5                listenv_0.9.0            
##  [27] labeling_0.4.2            GenomeInfoDbData_1.2.7   
##  [29] polyclip_1.10-4           farver_2.1.1             
##  [31] rprojroot_2.0.3           basilisk_1.6.0           
##  [33] parallelly_1.36.0         vctrs_0.6.3              
##  [35] generics_0.1.3            xfun_0.39                
##  [37] timechange_0.2.0          R6_2.5.1                 
##  [39] ggbeeswarm_0.7.2          rsvd_1.0.5               
##  [41] gridGraphics_0.5-1        bitops_1.0-7             
##  [43] spatstat.utils_3.0-3      cachem_1.0.8             
##  [45] DelayedArray_0.20.0       assertthat_0.2.1         
##  [47] promises_1.2.0.1          scales_1.2.1             
##  [49] googlesheets4_1.1.1       beeswarm_0.4.0           
##  [51] gtable_0.3.3              beachmat_2.10.0          
##  [53] globals_0.16.2            goftest_1.2-3            
##  [55] rlang_1.1.1               splines_4.1.3            
##  [57] lazyeval_0.2.2            gargle_1.5.1             
##  [59] spatstat.geom_3.2-1       broom_1.0.5              
##  [61] yaml_2.3.7                reshape2_1.4.4           
##  [63] abind_1.4-5               modelr_0.1.11            
##  [65] backports_1.4.1           httpuv_1.6.11            
##  [67] tools_4.1.3               ggplotify_0.1.1          
##  [69] ellipsis_0.3.2            jquerylib_0.1.4          
##  [71] RColorBrewer_1.1-3        ggridges_0.5.4           
##  [73] Rcpp_1.0.10               plyr_1.8.8               
##  [75] sparseMatrixStats_1.6.0   zlibbioc_1.40.0          
##  [77] RCurl_1.98-1.12           basilisk.utils_1.6.0     
##  [79] deldir_1.0-9              pbapply_1.7-2            
##  [81] viridis_0.6.3             cowplot_1.1.1            
##  [83] zoo_1.8-12                haven_2.5.3              
##  [85] ggrepel_0.9.3             cluster_2.1.4            
##  [87] here_1.0.1                fs_1.6.2                 
##  [89] magrittr_2.0.3            data.table_1.14.8        
##  [91] scattermore_1.2           lmtest_0.9-40            
##  [93] reprex_2.0.2              RANN_2.6.1               
##  [95] googledrive_2.1.1         fitdistrplus_1.1-11      
##  [97] hms_1.1.3                 patchwork_1.1.2          
##  [99] mime_0.12                 evaluate_0.21            
## [101] xtable_1.8-4              readxl_1.4.2             
## [103] gridExtra_2.3             compiler_4.1.3           
## [105] KernSmooth_2.23-21        crayon_1.5.2             
## [107] htmltools_0.5.5           later_1.3.1              
## [109] tzdb_0.4.0                lubridate_1.9.2          
## [111] DBI_1.1.3                 dbplyr_2.3.2             
## [113] MASS_7.3-60               Matrix_1.5-4.1           
## [115] cli_3.6.3                 datasets_4.1.3           
## [117] grDevices_4.1.3           parallel_4.1.3           
## [119] igraph_1.5.0              pkgconfig_2.0.3          
## [121] dir.expiry_1.2.0          sp_2.0-0                 
## [123] plotly_4.10.2             spatstat.sparse_3.0-2    
## [125] xml2_1.3.4                vipor_0.4.5              
## [127] bslib_0.5.0               XVector_0.34.0           
## [129] rvest_1.0.3               yulab.utils_0.0.6        
## [131] digest_0.6.32             sctransform_0.3.5        
## [133] RcppAnnoy_0.0.21          spatstat.data_3.0-1      
## [135] rmarkdown_2.23            cellranger_1.1.0         
## [137] leiden_0.4.3              uwot_0.1.16              
## [139] DelayedMatrixStats_1.16.0 shiny_1.7.4              
## [141] lifecycle_1.0.3           nlme_3.1-162             
## [143] jsonlite_1.8.7            BiocNeighbors_1.12.0     
## [145] viridisLite_0.4.2         fansi_1.0.4              
## [147] pillar_1.9.0              lattice_0.21-8           
## [149] fastmap_1.1.1             httr_1.4.6               
## [151] survival_3.5-5            glue_1.6.2               
## [153] png_0.1-8                 stringi_1.7.12           
## [155] sass_0.4.6                BiocSingular_1.10.0      
## [157] irlba_2.3.5.1             future.apply_1.11.0